Input data and configuration

# get default parameters the papermill way.
input_file = "results/04_annotate_cell_types/adata.h5ad"
output_file = "results/05_prepare_de/adata.h5ad"
output_file_obs = "results/05_prepare_de/adata_obs.tsv"
results_dir = "results/05_prepare_de"
cpus = 32
# Parameters
input_file = "input_adata.h5ad"
output_file = "adata.h5ad"
output_file_obs = "adata_obs.tsv"
cpus = "32"
results_dir = "."
import pandas as pd
import scanpy as sc
import numpy as np
from matplotlib import pyplot as plt
from collections import OrderedDict
import os
import sys
import gc

sys.path.append("lib")
sys.path.append("../lib")

from jupytertools import setwd, fix_logging, display

from toolz.functoolz import pipe, partial

from multiprocessing import Pool
import seaborn as sns
from plotnine import ggplot, aes
import plotnine as n
import scipy.stats as stats
import itertools

setwd()
fix_logging(sc.settings)
Working directory did not change because calling from nextflow. 
# setup R integration
import rpy2.rinterface_lib.callbacks
import anndata2ri
from rpy2.robjects import pandas2ri
from rpy2.robjects.packages import importr

pandas2ri.activate()
anndata2ri.activate()
%load_ext rpy2.ipython
%%R
library(conflicted)
conflict_prefer("Position", "base")
library(dplyr)
library(ggplot2)
library(ggpubr)
library(ggbeeswarm)
library(edgeR)
options(max.print=100)
options(repr.matrix.max.cols=50, repr.matrix.max.rows=6)
R[write to console]: [conflicted] Will prefer base::Position over any other package

R[write to console]: Loading required package: magrittr

R[write to console]: Loading required package: limma

markers = pd.read_csv("tables/cell_type_markers.csv")
adata = sc.read_h5ad(input_file)

Analysis of T cells

adata.obs.columns
Index(['batch', 'samples', 'patient', 'origin', 'replicate', 'dataset',
       'tumor_type', 'platform', 'age', 'sex', 'facs_purity_cd3',
       'facs_purity_cd56', 'n_genes', 'mt_frac', 'n_counts', 'rk_n_counts',
       'rk_n_genes', 'rk_mt_frac', 'doublet_score', 'is_doublet', 'leiden',
       'S_score', 'G2M_score', 'phase', 'cell_type', 'cell_type_unknown',
       'cell_type_coarse'],
      dtype='object')
# subset to T cells
adata_all = adata
mask = adata.obs["cell_type_coarse"].isin(["T cell", "NK cell"])
adata = adata[mask, :].copy()
sc.pl.umap(adata, color="cell_type")

Redo neighbors, umap, clustering

adata = sc.AnnData(X=adata.raw.X, var=adata.raw.var, obs=adata.obs, raw=adata.raw)
adata.uns["norm_log"] = True
sc.pp.pca(adata)
sc.pp.neighbors(adata, n_neighbors=10)
sc.tl.umap(adata)
Note that scikit-learn's randomized PCA might not be exactly reproducible across different computational platforms. For exact reproducibility, choose `svd_solver='arpack'.` This will likely become the Scanpy default in the future.
computing PCA with n_comps = 50
    finished
computing neighbors
    using 'X_pca' with n_pcs = 50
    finished
computing UMAP
    finished
sc.pl.umap(adata, color=["samples", "cell_type"])
sc.pl.umap(adata, color=["CD4", "CD8A", "FOXP3", "PDCD1", "KLRF1"])

inspect and correct for batch effects

  • Using combat resulted in a blob with a lot of signal lost.
  • The patients are admixed fairly well already (and will improve further after HVG filtering (see below))
  • I don't use combat here therefore.

(using combat with covariates did not work out -> singular matrix, i.e. too few samples per group)

HVG-filtering

  • variable genes reduced to 3000, as we are only dealing with cells of the same major type now.
sc.pp.highly_variable_genes(adata, flavor="cell_ranger", n_top_genes=3000)
sc.pl.highly_variable_genes(adata)
extracting highly variable genes
    finished
added
    'highly_variable', boolean vector (adata.var)
    'means', float vector (adata.var)
    'dispersions', float vector (adata.var)
    'dispersions_norm', float vector (adata.var)

redo neighbors and clustering after HVG filtering.

  • patient admixture looks a lot better now
  • batch effects appear not to be a major issue.
sc.pp.pca(adata, svd_solver="arpack")
sc.pp.neighbors(adata, n_neighbors=10)
sc.tl.umap(adata)
computing PCA with n_comps = 50
computing PCA on highly variable genes
    finished
computing neighbors
    using 'X_pca' with n_pcs = 50
    finished
computing UMAP
    finished
sc.pl.umap(adata, color=["samples", "cell_type"])
sc.pl.umap(adata, color=["CD4", "CD8A", "FOXP3", "PDCD1", "KLRF1"])

Do clustering for all cell types individually

adatas = {
    "CD4": adata[
        adata.obs["cell_type"]
        .isin(["T cell CD4+ non-regulatory", "T cell regulatory"])
        .tolist(),
        :,
    ].copy(),
    "CD8": adata[(adata.obs["cell_type"] == "T cell CD8+").tolist(), :].copy(),
    "NK": adata[(adata.obs["cell_type"] == "NK cell").tolist(), :].copy(),
}
for ct, tmp_adata in adatas.items():
    print("###########################\n{}\n###########################\n\n".format(ct))
    sc.pp.highly_variable_genes(tmp_adata, flavor="cell_ranger", n_top_genes=2000)
    sc.pl.highly_variable_genes(tmp_adata)
###########################
CD4
###########################


extracting highly variable genes
    finished
added
    'highly_variable', boolean vector (adata.var)
    'means', float vector (adata.var)
    'dispersions', float vector (adata.var)
    'dispersions_norm', float vector (adata.var)

###########################
CD8
###########################


extracting highly variable genes
    finished
added
    'highly_variable', boolean vector (adata.var)
    'means', float vector (adata.var)
    'dispersions', float vector (adata.var)
    'dispersions_norm', float vector (adata.var)

###########################
NK
###########################


extracting highly variable genes
/home/sturm/scratch/projects/2020/kortekaas2020_paper/work/conda/run_notebook-b99176dfb05e0522988294512373169f/lib/python3.7/site-packages/scanpy/preprocessing/_highly_variable_genes.py:135: RuntimeWarning: divide by zero encountered in true_divide
  ) / disp_mad_bin[df['mean_bin'].values].values
/home/sturm/scratch/projects/2020/kortekaas2020_paper/work/conda/run_notebook-b99176dfb05e0522988294512373169f/lib/python3.7/site-packages/scanpy/preprocessing/_highly_variable_genes.py:135: RuntimeWarning: invalid value encountered in true_divide
  ) / disp_mad_bin[df['mean_bin'].values].values
    finished
added
    'highly_variable', boolean vector (adata.var)
    'means', float vector (adata.var)
    'dispersions', float vector (adata.var)
    'dispersions_norm', float vector (adata.var)

for ct, tmp_adata in adatas.items():
    print("###########################\n{}\n###########################\n\n".format(ct))
    sc.pp.pca(tmp_adata, svd_solver="arpack")
    sc.pp.neighbors(tmp_adata, n_neighbors=10)
    sc.tl.umap(tmp_adata)
    sc.pl.umap(tmp_adata, color=["samples", "cell_type"])
###########################
CD4
###########################


computing PCA with n_comps = 50
computing PCA on highly variable genes
    finished
computing neighbors
    using 'X_pca' with n_pcs = 50
    finished
computing UMAP
    finished

###########################
CD8
###########################


computing PCA with n_comps = 50
computing PCA on highly variable genes
    finished
computing neighbors
    using 'X_pca' with n_pcs = 50
    finished
computing UMAP
    finished

###########################
NK
###########################


computing PCA with n_comps = 50
computing PCA on highly variable genes
    finished
computing neighbors
    using 'X_pca' with n_pcs = 50
    finished
computing UMAP
    finished

def leiden_with_r(adata_key, r, seed):
    key = "leiden_{:.3f}".format(r)
    sc.tl.leiden(adatas[adata_key], resolution=r, key_added=key, random_state=seed)
    return adatas[adata_key].obs[key]
def test_leiden_thresholds(adata_key, resolutions, seeds, n_cpus):
    """
    Test different leiden thresholds.

    Args:
        adata:
        resolutions: numpy array containing all resolutions to test
        seeds: numpy array containin random seeds (every resolution is tested with every seed)
        p: multiprocessing.Pool
    """
    args = list(itertools.product([adata_key], resolutions, seeds))
    #     p = lambda: None
    #     p.starmap = lambda x, a: [x for x in itertools.starmap(x, a)]
    leiden_results = p.starmap(leiden_with_r, args)
    n_clusters = [leiden_results[i].cat.categories.size for i, _ in enumerate(args)]
    # re-arrange results in dataframe and aggregate by mean.
    clusters = {s: dict() for s in seeds}

    for i, (a, r, s) in enumerate(args):
        clusters[s][r] = n_clusters[i]

    clusters_mean = np.mean(pd.DataFrame.from_dict(clusters).values, axis=1)

    return clusters_mean
%%capture
p = Pool(int(cpus))
leiden_thres = {
    "CD4": 0.2,
    "CD8": 0.2,
    "NK": 0.2,
}
# test_leiden_thresholds("NK", resolutions=np.arange(0.1, 0.2, 0.05), seeds=np.arange(0, 3), n_cpus=1)
resolutions = np.arange(0.1, 1.5, 0.05)
seeds = np.arange(0, 10)
for ct, tmp_adata in adatas.items():
    print("###########################\n{}\n###########################\n\n".format(ct))
    clusters_mean = test_leiden_thresholds(ct, resolutions, seeds, 16)
    plt.plot(resolutions, clusters_mean)
    # plt.plot(resolutions, pd.Series(clusters_mean).rolling(3), color="red")
    plt.xlabel("leiden resolution")
    plt.ylabel("#clusters")
    plt.vlines(x=leiden_thres[ct], ymin=0, ymax=plt.ylim()[1], color="grey")
    plt.show()
###########################
CD4
###########################


###########################
CD8
###########################


###########################
NK
###########################


There is no clear plateau... anyway, 10 clusters sounds reasonable.

for ct, tmp_adata in adatas.items():
    print("###########################\n{}\n###########################\n\n".format(ct))
    sc.tl.leiden(tmp_adata, resolution=leiden_thres[ct])
    sc.pl.umap(tmp_adata, color="leiden", legend_loc="on data")
    sc.tl.rank_genes_groups(tmp_adata, groupby="leiden")
    sc.pl.rank_genes_groups(tmp_adata)
###########################
CD4
###########################


running Leiden clustering
    finished

ranking genes
    finished

###########################
CD8
###########################


running Leiden clustering
    finished

ranking genes
    finished

###########################
NK
###########################


running Leiden clustering
    finished

ranking genes
    finished

Write output adata.h5ad

for ct, tmp_adata in adatas.items():
    adata.obs.loc[tmp_adata.obs.index, "cluster"] = [
        "{}_{}".format(ct, x) for x in tmp_adata.obs["leiden"]
    ]
sc.pl.umap(adata, color=["cluster", "cell_type"], legend_loc="on data")
... storing 'cluster' as categorical

adata.write_h5ad(output_file, compression="lzf")
adata.obs.to_csv(output_file_obs, sep="\t")